In [425]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load Data
# Two CSV exports are read separately and then stacked row-wise into one frame
# with a fresh 0..n-1 index.
file1 = '/path/to/Gly-data y1 copy.csv'
file2 = '/path/to/Gly-data y2.csv'
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))
data = pd.concat(objs=[df1, df2], ignore_index=True)

def clean_target(val):
    """Convert one raw target cell into a float.

    Parameters
    ----------
    val : object
        Raw cell value (string, number, ``None``, NaN, ...).

    Returns
    -------
    float
        ``0.01`` when the whitespace-stripped string form of ``val`` is the
        placeholder ``'xx'``; otherwise ``float(val)``; ``np.nan`` when the
        value cannot be converted to a float.
    """
    # Placeholder marker is mapped to a small constant instead of being dropped.
    if str(val).strip() == 'xx':
        return 0.01
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures mean "unparseable". Anything else
        # (e.g. KeyboardInterrupt) must propagate rather than be hidden as NaN.
        return np.nan

# Normalise both target columns: placeholder -> 0.01, unparseable -> NaN.
data['y1'] = data['y1'].apply(clean_target)
data['y2'] = data['y2'].apply(clean_target)

# Keep a row as long as at least one of the two targets is present.
data = data.dropna(subset=['y1', 'y2'], how='all')
# NOTE(review): a target that is still missing here is replaced by 0.01, so
# such rows are later trained/scored against a filled-in value. Confirm this
# is intended (e.g. if each CSV only carries one of the two targets).
data['y1'] = data['y1'].fillna(0.01)
data['y2'] = data['y2'].fillna(0.01)

# Feature matrix = every column except the two targets.
X = data.drop(columns=['y1', 'y2'])
y1 = data['y1']

# Booleans are turned into strings so they are picked up as categorical below.
bool_cols = X.select_dtypes(include='bool').columns.tolist()
X[bool_cols] = X[bool_cols].astype(str)
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# A constant fill carries no information between rows, so doing it up front
# is safe; astype(str) afterwards guarantees a uniform string dtype.
X[categorical_cols] = X[categorical_cols].fillna('missing').astype(str)

# Numeric NaNs are intentionally left in place. Filling them here with the
# mean of the whole dataset would leak validation rows into training; the
# numeric pipeline's SimpleImputer(strategy='mean') learns the mean from the
# training split only when the model pipeline is fitted.

# Categorical branch: missing values become the literal level 'missing', then
# every level is one-hot encoded into a dense array. Levels unseen at fit time
# are ignored at transform time.
missing_level_imputer = SimpleImputer(fill_value='missing', strategy='constant')
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipeline = Pipeline(steps=[
    ('imputer', missing_level_imputer),
    ('encoder', one_hot_encoder),
])

# Numeric branch: mean imputation followed by standardisation.
mean_imputer = SimpleImputer(strategy='mean')
standardiser = StandardScaler()
num_pipeline = Pipeline(steps=[
    ('imputer', mean_imputer),
    ('scaler', standardiser),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, categorical_cols),
    ('num', num_pipeline, numerical_cols),
])

# Settings shared by every ensemble regressor: 100 estimators and a fixed seed.
ensemble_kwargs = dict(n_estimators=100, random_state=42)

# Candidate regressors keyed by display name; insertion order is the order in
# which they are trained and plotted.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor(**ensemble_kwargs)),
    ("GradientBoosting", GradientBoostingRegressor(**ensemble_kwargs)),
    ("AdaBoost", AdaBoostRegressor(**ensemble_kwargs)),
    ("XGBoost", XGBRegressor(verbosity=0, **ensemble_kwargs)),
    ("CatBoost", CatBoostRegressor(verbose=0, **ensemble_kwargs)),
])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y1_train, y1_val = train_test_split(X, y1, test_size=0.2, random_state=42)

# One record per model with R² and RMSE on both splits.
results = []
for name, model in models.items():
    # Preprocessing is part of the pipeline, so it is fitted on the training
    # split only and then applied unchanged to the validation split.
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)
    y1_train_pred = pipeline.predict(X_train)
    y1_val_pred = pipeline.predict(X_val)

    train_r2 = r2_score(y1_train, y1_train_pred)
    val_r2 = r2_score(y1_val, y1_val_pred)
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    train_rmse = np.sqrt(mean_squared_error(y1_train, y1_train_pred))
    val_rmse = np.sqrt(mean_squared_error(y1_val, y1_val_pred))

    results.append({
        'Model': name,
        'Train R²': train_r2,
        'Validation R²': val_r2,
        'Train RMSE': train_rmse,
        'Validation RMSE': val_rmse
    })

# Build the results table from the per-model records collected in the training
# loop; without this the plot below has no `results_df` on a fresh kernel.
results_df = pd.DataFrame(results)

# Long format: one row per (model, split) so seaborn can draw grouped bars.
r2_long = results_df.melt(id_vars='Model', value_vars=['Train R²', 'Validation R²'])

plt.figure(figsize=(12, 6))
sns.barplot(data=r2_long, x='Model', y='value', hue='variable')
plt.title('A-Ratio: R² Scores for Train and Validation')
plt.ylabel('R² Score')
plt.xticks(rotation=45)
plt.legend(title='Dataset', loc='lower right')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [444]:
import matplotlib.pyplot as plt

# One 5x2 grid of panels per split (training / validation), i.e. ten panels
# per figure.
grid_kwargs = dict(figsize=(11, 16), dpi=600)
fig_train, train_grid = plt.subplots(5, 2, **grid_kwargs)
fig_val, val_grid = plt.subplots(5, 2, **grid_kwargs)

# Flatten the 2-D axes arrays so a panel can be addressed by a single index.
train_axes, val_axes = train_grid.flatten(), val_grid.flatten()

# Fresh, unfitted regressors for the y2 target, keyed by display name.
# Ensemble models share 100 estimators and a fixed seed.
y2_ensemble_kwargs = dict(n_estimators=100, random_state=42)
models_y2 = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor(**y2_ensemble_kwargs)),
    ("GradientBoosting", GradientBoostingRegressor(**y2_ensemble_kwargs)),
    ("AdaBoost", AdaBoostRegressor(**y2_ensemble_kwargs)),
    ("XGBoost", XGBRegressor(verbosity=0, **y2_ensemble_kwargs)),
    ("CatBoost", CatBoostRegressor(verbose=0, **y2_ensemble_kwargs)),
])

# Second target; same 80/20 split settings and seed as used for y1.
y2 = data['y2']
X_train, X_val, y2_train, y2_val = train_test_split(X, y2, test_size=0.2, random_state=42)

# Fit each model and draw one actual-vs-predicted panel per split.
for idx, (name, model) in enumerate(models_y2.items()):
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y2_train)
    y2_train_pred = pipeline.predict(X_train)
    y2_val_pred = pipeline.predict(X_val)

    train_r2 = r2_score(y2_train, y2_train_pred)
    val_r2 = r2_score(y2_val, y2_val_pred)
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    train_rmse = np.sqrt(mean_squared_error(y2_train, y2_train_pred))
    val_rmse = np.sqrt(mean_squared_error(y2_val, y2_val_pred))

    # Training panel: scatter of true vs predicted plus the y = x reference line.
    ax_train = train_axes[idx]
    ax_train.scatter(y2_train, y2_train_pred, alpha=0.6, edgecolors='k')
    ax_train.plot([y2_train.min(), y2_train.max()], [y2_train.min(), y2_train.max()], 'r--')
    ax_train.set_title(f"{name} (Train)")
    ax_train.set_xlabel('True Yield')
    ax_train.set_ylabel('Predicted Yield')
    # The legend is used as a metrics box: its single label is paired with an
    # artist that is already on the axes.
    ax_train.legend([f"R² = {train_r2:.2f}\nRMSE = {train_rmse:.2f}"], loc='lower right')

    # Validation panel: same layout, using the held-out rows.
    ax_val = val_axes[idx]
    ax_val.scatter(y2_val, y2_val_pred, alpha=0.6, edgecolors='k')
    ax_val.plot([y2_val.min(), y2_val.max()], [y2_val.min(), y2_val.max()], 'r--')
    ax_val.set_title(f"{name} (Validation)")
    ax_val.set_xlabel('True Yield')
    ax_val.set_ylabel('Predicted Yield')
    ax_val.legend([f"R² = {val_r2:.2f}\nRMSE = {val_rmse:.2f}"], loc='lower right')

# Tighten each figure's layout, then place its overall title slightly above
# the top edge (y=1.02) before rendering both figures.
figure_titles = (
    (fig_train, "Training: Actual vs Predicted Yield (y2)"),
    (fig_val, "Validation: Actual vs Predicted Yield (y2)"),
)
for figure, heading in figure_titles:
    figure.tight_layout()
    figure.suptitle(heading, fontsize=18, y=1.02)
plt.show()
No description has been provided for this image
No description has been provided for this image